# Kernel: sgemm_nt_128x128

// Named constants used by this kernel.
//   addr_zero : an address expression, not a constant-bank slot. Reading 4x<128*8>
//               as 4096 (the readBs comment in the prologue pairs the two),
//               4x<128*8*4> is 16384, i.e. twice the 4x<128*8*2> value that the
//               code XORs into writeS, readAs and readBs.
//   gridDimA, gridDimB : not referenced in this file; presumably for the included
//               common code -- TODO confirm.
//   param_*   : kernel arguments in constant bank 0 starting at 0x140. param_A,
//               param_B and param_C each take two 32-bit slots. The pointer setup
//               adds slot 0 with .CC and slot 1 with IADD.X, so slot 0 is the low
//               word and slot 1 the high word of the address.
//   param_C, param_alpha, param_beta and param_ldc are not read in this file;
//               presumably consumed by the included common code -- TODO confirm.
<CONSTANT_MAPPING>
  addr_zero  : 4x<128*8*4>

  gridDimA : c[0x0][0x14]
  gridDimB : c[0x0][0x18]

  param_A[0]  : c[0x0][0x140]
  param_A[1]  : c[0x0][0x144]
  param_B[0]  : c[0x0][0x148]
  param_B[1]  : c[0x0][0x14c]
  param_C[0]  : c[0x0][0x150]
  param_C[1]  : c[0x0][0x154]
  param_alpha : c[0x0][0x158]
  param_beta  : c[0x0][0x15c]
  param_lda   : c[0x0][0x160]
  param_ldb   : c[0x0][0x164]
  param_ldc   : c[0x0][0x168]
  param_m     : c[0x0][0x16c]
  param_n     : c[0x0][0x170]
  param_k     : c[0x0][0x174]
</CONSTANT_MAPPING>

// Register names for this kernel. Each entry lists register numbers, a separator
// (':' or '~'), then the names bound to them.
// NOTE(review): in maxas-style assemblers ':' binds names to the listed registers
// in order, while '~' lets the tool choose the placement inside the range --
// confirm against the assembler actually used for this file.
//
// Several ranges overlap, so names that share registers must not be live together:
//   0-63    czero<00-63> and the cx<0-7>y<0-7> names cover the same 64 registers.
//   64-95   the setup temporaries (blkA ... tid128) share registers with the
//           j0Ay/j0Bx/j1Ay/j1Bx operand names.
//   64-121  the ldc/ci/..., c<0-7>/d*, C00y.../C12y... and ldc1.../flags names
//           overlap the ranges above; of these only ldcz is used in this file.
// Persistent state used by the load/store code in this file:
//   96-103 loadA<0-3>, loadB<0-3>; 112-115 trackA<0-1>, trackB<0-1>;
//   116-122 writeS, k, tidY, ...; 123-127 readAs, readBs, tid, k_and; 144-150 k1-k3.
// NOTE(review): ldcz is written in the prologue although its range (86-121)
// overlaps loadA/loadB, trackA/trackB, writeS, k and tidY -- confirm that the
// value survives until the included code reads it.
// Names not referenced in this file (blk, tid2, tid127, xmad_ta, xmad_tb, loop,
// tmp_data, ...) may still be used by the included common code.
<REGISTER_MAPPING>

  64-95 ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128

  0-63 : czero<00-63>

   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7

  64-67 : j0Ay<0-3>
  68-71 : j0Bx<0-3>
  72-75 : j0Ay<4-7>
  76-79 : j0Bx<4-7>
  80-83 : j1Ay<0-3>
  84-87 : j1Bx<0-3>
  88-91 : j1Ay<4-7>
  92-95 : j1Bx<4-7>

  96-103 : loadA<0-3>,  loadB<0-3>
  112-115 : trackA<0-1>, trackB<0-1>

  116-122 ~ writeS, k, tidY, ta, tb, loop
  123-127 ~ readAs, readBs, tid, k_and
  128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1
  144-150 ~ k1, k2, k3

  64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
  64-75 : c<0-7>, d3, d2, d1, d0
  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
  86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags

</REGISTER_MAPPING>

// ---- Prologue: thread/block ids, global load pointers, shared-memory offsets ----
// Runs once before the REMAINDER label. Produces trackA/trackB (64-bit global
// addresses), writeS/readAs/readBs (shared-memory byte offsets), the bounds
// predicates P5/P6, and zeroes registers 0-63.

// Thread and block ids. Note the mapping: blkA <- CTAID.Y, blkB <- CTAID.Z,
// blkZ <- CTAID.X.
-:-:-:-:00 S2R tid,  SR_TID.X;
-:-:-:-:00 S2R blkA, SR_CTAID.Y;
-:-:-:-:00 S2R blkB, SR_CTAID.Z;
-:-:-:-:00 S2R blkZ, SR_CTAID.X;

// k is the running count of remaining K elements (decremented by the loop inserts).
// The three z strides are forced to zero, so the blkZ terms added to ta/tb below
// contribute nothing in this kernel.
-:-:-:-:00 MOV k,  param_k;
-:-:-:-:00 MOV ldaz, RZ;
-:-:-:-:00 MOV ldbz, RZ;
-:-:-:-:00 MOV ldcz, RZ;
// tid1 = tid & 1
-:-:-:-:00 LOP.AND tid1, tid,  1;

// Zero registers 0-63: store 128 bits of RZ at addr_zero, then read them back with
// sixteen 128-bit loads into czero00, czero04, ..., czero60. The Perl expression
// below generates those sixteen LDS lines.
-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
<CODE>
  join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
</CODE>
-:-:-:-:00 MOV lda, param_lda;
-:-:-:-:00 MOV ldb, param_ldb;

// tidY = (tid & 1) << 2, i.e. 0 or 4: first K offset handled by this thread
-:-:-:-:00 SHL tidY, tid1, 2;

// tidX = tid >> 1: row within the block's tile handled by this thread
-:-:-:-:00 SHR.U32 tidX, tid, 1;

// txa    = blkA * 128 + tidX
// ta     = txa * lda + tidY + ldaz * blkZ      (ldaz is 0, see above)
// trackA = param_A + 4 * ta                    (64-bit add: .CC then .X)
// NOTE(review): the high word is param_A[1] plus carry only; the upper bits of
// the 32-bit product 4 * ta are not propagated -- presumably offsets are assumed
// to fit in 32 bits.
-:-:-:-:00 ISCADD txa, blkA, tidX, 7;
-:-:-:-:00 IMAD ta, lda, txa, tidY;
-:-:-:-:00 IMAD ta, ldaz, blkZ, ta;
-:-:-:-:00 MOV tmp_param0, param_A[0];
-:-:-:-:00 MOV tmp_param1, param_A[1];
-:-:-:-:00 SHL tmp_shl, ta, 0x2;
-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;

// txb    = blkB * 128 + tidX
// tb     = txb * ldb + tidY + ldbz * blkZ      (ldbz is 0, see above)
// trackB = param_B + 4 * tb                    (same 64-bit add pattern as trackA)
-:-:-:-:00 ISCADD txb, blkB, tidX, 7;
-:-:-:-:00 IMAD tb, ldb, txb, tidY;
-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
-:-:-:-:00 MOV tmp_param0, param_B[0];
-:-:-:-:00 MOV tmp_param1, param_B[1];
-:-:-:-:00 SHL tmp_shl, tb, 0x2;
-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;

// Bounds predicates, combined into every load predicate later on:
//   P5 = txa < m   (this thread's A row is in range)
//   P6 = txb < n   (this thread's B row is in range)
-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;

// writeS = 4 * (128 * tidY + tidX)
-:-:-:-:00 ISCADD writeS, tidY, tidX, 7;
-:-:-:-:00 SHL    writeS, writeS, 2;

// Flip the 4x<128*8*2> bit so the first stores go to the half that readAs/readBs
// are flipped to just before the first barrier in REMAINDER.
-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;

// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4
-:-:-:-:00 LOP.AND readAs, tid,    0x70;
-:-:-:-:00 SHR.U32 readAs, readAs, 3;
-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
-:-:-:-:00 SHL     readAs, readAs, 4;

// readBs = ((((tid & 128) >> 4) | ((tid >> 1) & 7)) << 4) + 4096
// The ISCADD performs the shift by 4 and adds 4x<128*8>, the same offset at which
// the B stores begin (writeS + 4x<8*128>).
-:-:-:-:00 LOP.AND tid128, tid,  128;
-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
-:-:-:-:00 SHR.U32 readBs, tid128, 4;
-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;

// Two NOPs ahead of the REMAINDER label. NOTE(review): purpose is not evident
// from this file (possibly padding/alignment) -- confirm before removing.
-:-:-:-:00 NOP; 
-:-:-:-:00 NOP; 

// ---- REMAINDER: bounds-checked load of one 8-deep K slice into shared memory ----
// Executed once on entry (falling through from the prologue) and again when the
// loop tail branches here with P1 set. Unlike the loads inserted into the main
// loop, every element is individually checked against k and out-of-range
// elements are replaced with zero. k itself is not modified in this block.
REMAINDER:

<CODE>
    return q{
      // K indices of the four elements this thread loads: tidY, tidY+1, tidY+2, tidY+3
      -:-:-:-:00 IADD k1, tidY, 1;
      -:-:-:-:00 IADD k2, tidY, 2;
      -:-:-:-:00 IADD k3, tidY, 3;

      // A side: P0..P3 = (K index < k) AND P5 (row of A in range)
      -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P5;
      -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P5;
      -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P5;
      -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P5;

      -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>];
      -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>];
      -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>];
      -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>];

      // Elements that were not loaded are stored as zero
      -:-:-:-:00 @!P0 MOV loadA0, RZ;
      -:-:-:-:00 @!P1 MOV loadA1, RZ;
      -:-:-:-:00 @!P2 MOV loadA2, RZ;
      -:-:-:-:00 @!P3 MOV loadA3, RZ;

      // B side: same pattern, P0..P3 reused with P6 (row of B in range)
      -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P6;
      -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P6;
      -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P6;
      -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P6;

      -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>];
      -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>];
      -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>];
      -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>];

      -:-:-:-:00 @!P0 MOV loadB0, RZ;
      -:-:-:-:00 @!P1 MOV loadB1, RZ;
      -:-:-:-:00 @!P2 MOV loadB2, RZ;
      -:-:-:-:00 @!P3 MOV loadB3, RZ;

      // bDoRemainder = ((k & 7) != 0) && (k > 8), computed in two steps:
      // here P1 = ((k & 7) == 0); the last instruction of this block turns it
      // into P1 = (k > 8) AND (NOT P1).
      -:-:-:-:00 LOP.AND k_and, k, 7;
      -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT;

      // Store the four A values 128 words apart starting at writeS, and the four B
      // values at the same spacing starting 8*128 words further on.
      // NOTE(review): these stores carry a -:G:-:-:15 control prefix unlike the
      // neighbouring lines; its meaning is defined by the assembler -- confirm
      // before changing.
      -:G:-:-:15 STS [writeS + 4x<0*128>], loadA0;
      -:G:-:-:15 STS [writeS + 4x<1*128>], loadA1;
      -:G:-:-:15 STS [writeS + 4x<2*128>], loadA2;
      -:G:-:-:15 STS [writeS + 4x<3*128>], loadA3;

      -:G:-:-:15 STS [writeS + 4x< 8*128>], loadB0;
      -:G:-:-:15 STS [writeS + 4x< 9*128>], loadB1;
      -:G:-:-:15 STS [writeS + 4x<10*128>], loadB2;
      -:G:-:-:15 STS [writeS + 4x<11*128>], loadB3;

      // Advance both 64-bit global pointers by 4x<8> (eight elements along K);
      // the IADD.X lines propagate the carry into the high words.
      -:-:-:-:00 IADD   trackA0.CC, trackA0, 4x<8>;
      -:-:-:-:00 IADD.X trackA1, trackA1, RZ;

      -:-:-:-:00 IADD   trackB0.CC, trackB0, 4x<8>;
      -:-:-:-:00 IADD.X trackB1, trackB1, RZ;

      // Point the read offsets at the half just written, synchronise, then move
      // writeS to the other half for the next stores.
      -:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
      -:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
      -:-:-:-:00 BAR.SYNC 0;
      -:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;

      // Second half of bDoRemainder: P1 = (k > 8) AND (k & 7) != 0.
      // P1 is consumed by the BRA.U REMAINDER line in the loop-tail insert.
      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1;
    };
</CODE>

// ---- Main-loop inserts ----
// Defines the Perl hash %insert and returns nothing, so this section emits no
// text by itself. Each value is instruction text keyed by a slot name.
// NOTE(review): the slot names look like "j<K step>c<position>" and are presumably
// spliced into the unrolled loop by the common file included below -- confirm there.
// The LOOP label targeted by the final branch is not defined in this file.
<CODE>
    our %insert =
    (
        # Loop predicates, all evaluated on k BEFORE the decrement in slot j2c53:
        #   P2 = (k >= 16) AND P5   -> prefetch next A slice
        #   P3 = (k >= 16) AND P6   -> prefetch next B slice
        #   P0 = (k >= 16)          -> store, flip buffers, sync, loop again
        # "\@" keeps Perl from interpolating an array; the emitted text is "@P2" etc.
        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, 16, P5;\n",
        j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, 16, P6;\n",
        j0c61 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, 16, PT;\n",
        j0c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n",

        # Unchecked (per element) global loads of the next slice. When P2/P3 is
        # false the load registers keep their previous contents; REMAINDER left
        # them zero for out-of-range rows. NOTE(review): relies on nothing else
        # writing loadA/loadB in between -- confirm in the common file.
        j1c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n",
        j1c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n",
        j1c61 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n",
        j1c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n",

        # k -= 8 once per loop iteration (unconditional).
        j2c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n",
        j2c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
        j2c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n",
        j2c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n",

        # Advance the A pointer (64-bit, carry via .CC / .X) only if A was loaded.
        j3c47 => "-:-:-:-:00 \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
        j3c53 => "-:-:-:-:00 \@P2 IADD.X trackA1,    trackA1, RZ;\n",

        # Store the prefetched A values to shared memory, same layout as REMAINDER.
        # The control prefixes on these lines differ from the default; leave as is.
        j4c47 => "-:-:D:S:02 \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
        j4c53 => "-:-:D:S:02 \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
        j4c61 => "-:-:D:-:07 \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
        j4c62 => "-:-:D:-:07 \@P0 STS [writeS + 4x<3*128>], loadA3;\n",

        # Store the prefetched B values, 8*128 words above the A values.
        j5c47 => "-:-:D:S:02 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
        j5c53 => "-:-:D:S:02 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
        j5c61 => "-:-:D:-:07 \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
        j5c62 => "-:-:D:-:07 \@P0 STS [writeS + 4x<11*128>], loadB3;\n",

        # Advance the B pointer only if B was loaded.
        j6c47 => "-:-:-:-:00 \@P3 IADD   trackB0.CC, trackB0, 4x<8>;\n",
        j6c53 => "-:-:-:-:00 \@P3 IADD.X trackB1,    trackB1, RZ;\n",

        # Flip the read offsets to the half just written, then synchronise.
        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
        j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",

        # Flip the write offset for the next iteration's stores.
        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n",

        # Loop tail: two instructions share one slot (joined with "."). Continue
        # while P0 holds; otherwise, if P1 (set at the end of REMAINDER) holds, go
        # back to REMAINDER for the bounds-checked tail; otherwise fall through.
        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n".
                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
    );
    return;
</CODE>

// Pull in the shared 128x128 SGEMM code. It is not visible here; presumably it
// provides the LOOP label, consumes the insert table above and uses the
// C/alpha/beta/ldc names declared in the mappings -- TODO confirm.
<INCLUDE file="sgemm_common_128x128.sass"/>

